Load Packages

library(here)
## here() starts at C:/Users/dinap/Desktop/Research/sustainable_communities
library(mclust)
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
library(raster)
## Loading required package: sp
library(rgdal)
## rgdal: version: 1.5-18, (SVN revision 1082)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.0.4, released 2020/01/28
## Path to GDAL shared files: C:/Users/dinap/Documents/R/win-library/4.0/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 6.3.1, February 10th, 2020, [PJ_VERSION: 631]
## Path to PROJ shared files: C:/Users/dinap/Documents/R/win-library/4.0/rgdal/proj
## Linking to sp version:1.4-4
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
library(sp)
library(RColorBrewer)
library(knitr)

Import Data

#import raw data
dat_raw <- read.csv(here("data/CommunityData-raw-2015.csv"))
#select columns of interest, removed co2_hhs
dat  <- dat_raw[,c(4:12,14:21)]
#convert to dataframe
dat <- as.data.frame(unclass(dat))
#set rownames to city names
rownames(dat)=dat_raw$ME
#look at data
summary(dat)
##     AQI_Good       Bachelor_Over_25  Per_Poverty         Gini      
##  Min.   :0.01701   Min.   : 8.40    Min.   : 5.10   Min.   :36.49  
##  1st Qu.:0.75251   1st Qu.:17.00    1st Qu.:13.00   1st Qu.:43.15  
##  Median :0.79131   Median :21.40    Median :15.90   Median :45.13  
##  Mean   :0.78634   Mean   :23.37    Mean   :16.97   Mean   :45.25  
##  3rd Qu.:0.88055   3rd Qu.:28.20    3rd Qu.:19.30   3rd Qu.:47.06  
##  Max.   :0.99945   Max.   :65.50    Max.   :64.10   Max.   :59.00  
##  NA's   :17                                                        
##  non_migration    Per_Sev_Hous    Xstreamlengthimpaired Per_Avg_Land_Cov
##  Min.   :60.40   Min.   : 7.565   Min.   : 0.3649       Min.   : 0.00   
##  1st Qu.:82.50   1st Qu.:12.978   1st Qu.: 8.0551       1st Qu.:25.60   
##  Median :85.20   Median :15.059   Median :12.1582       Median :50.55   
##  Mean   :84.67   Mean   :15.667   Mean   :18.9924       Mean   :46.10   
##  3rd Qu.:87.50   3rd Qu.:17.506   3rd Qu.:22.2555       3rd Qu.:63.10   
##  Max.   :95.20   Max.   :32.368   Max.   :90.4849       Max.   :96.82   
##  NA's   :12      NA's   :28       NA's   :36            NA's   :28      
##  poor_health_percent Z_Water_Index      Index_Black      Index_Asian    
##  Min.   : 5.20       Min.   :0.00000   Min.   :0.1267   Min.   :0.0000  
##  1st Qu.:13.30       1st Qu.:0.02821   1st Qu.:0.4130   1st Qu.:0.4109  
##  Median :16.18       Median :0.06296   Median :0.4837   Median :0.4822  
##  Mean   :16.79       Mean   :0.09666   Mean   :0.4860   Mean   :0.4861  
##  3rd Qu.:19.80       3rd Qu.:0.12259   3rd Qu.:0.5592   3rd Qu.:0.5598  
##  Max.   :38.50       Max.   :0.60450   Max.   :0.8114   Max.   :0.8697  
##  NA's   :45          NA's   :37        NA's   :28       NA's   :28      
##   Index_Latino       GHG_Percap         UNEMPLOY         FOOD_INS    
##  Min.   :0.04181   Min.   :  4.276   Min.   :0.0203   Min.   : 6.00  
##  1st Qu.:0.28938   1st Qu.: 12.849   1st Qu.:0.0579   1st Qu.:11.60  
##  Median :0.36608   Median : 16.406   Median :0.0717   Median :13.50  
##  Mean   :0.36338   Mean   : 18.690   Mean   :0.0759   Mean   :13.95  
##  3rd Qu.:0.42962   3rd Qu.: 21.099   3rd Qu.:0.0881   3rd Qu.:15.60  
##  Max.   :0.92438   Max.   :166.428   Max.   :0.3562   Max.   :31.20  
##  NA's   :28        NA's   :30                         NA's   :28     
##    VIO_CRIME     
##  Min.   :   0.0  
##  1st Qu.: 176.6  
##  Median : 278.7  
##  Mean   : 308.4  
##  3rd Qu.: 405.0  
##  Max.   :1238.4  
##  NA's   :29
head(dat)
##                        AQI_Good Bachelor_Over_25 Per_Poverty  Gini
## Adjuntas, PR          0.9330234             18.1        64.1 57.04
## Aguadilla-Isabela, PR 0.9330234             19.7        53.1 51.94
## Alexander City, AL           NA             18.2        21.2 45.36
## Arecibo, PR           0.9330234             22.0        48.3 51.85
## Atmore, AL                   NA             12.1        23.8 48.44
## Bonham, TX                   NA             16.2        14.8 43.55
##                       non_migration Per_Sev_Hous Xstreamlengthimpaired
## Adjuntas, PR                     NA           NA                    NA
## Aguadilla-Isabela, PR            NA           NA                    NA
## Alexander City, AL             90.9           NA                    NA
## Arecibo, PR                      NA           NA                    NA
## Atmore, AL                     93.7           NA                    NA
## Bonham, TX                     80.7           NA                    NA
##                       Per_Avg_Land_Cov poor_health_percent Z_Water_Index
## Adjuntas, PR                        NA                  NA            NA
## Aguadilla-Isabela, PR               NA                  NA            NA
## Alexander City, AL                  NA                  NA            NA
## Arecibo, PR                         NA                  NA            NA
## Atmore, AL                          NA                  NA            NA
## Bonham, TX                          NA                  NA            NA
##                       Index_Black Index_Asian Index_Latino GHG_Percap UNEMPLOY
## Adjuntas, PR                   NA          NA           NA         NA   0.3562
## Aguadilla-Isabela, PR          NA          NA           NA         NA   0.2375
## Alexander City, AL             NA          NA           NA         NA   0.0980
## Arecibo, PR                    NA          NA           NA         NA   0.1649
## Atmore, AL                     NA          NA           NA         NA   0.1524
## Bonham, TX                     NA          NA           NA         NA   0.0641
##                       FOOD_INS VIO_CRIME
## Adjuntas, PR                NA        NA
## Aguadilla-Isabela, PR       NA        NA
## Alexander City, AL          NA        NA
## Arecibo, PR                 NA        NA
## Atmore, AL                  NA        NA
## Bonham, TX                  NA        NA
# remove any records that have NAs
dat = na.omit(dat)
# convert to z-scores
dat <- as.matrix(scale(dat, center = TRUE, scale = TRUE))

Run Cluster GMM

#run mclust on 1 to 20 clusters
dat_mc <- Mclust(dat, G=c(1:20))
#print summary of model fit
summary(dat_mc)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 10 components: 
## 
##  log-likelihood   n  df       BIC       ICL
##       -16206.34 886 485 -35704.23 -35902.49
## 
## Clustering table:
##   1   2   3   4   5   6   7   8   9  10 
##  50  59  53  62 150 129  51  66 142 124

Model comparison

#plot BIC scores of different models
plot(dat_mc, what='BIC')

#zoom in on best-fitting models
plot(dat_mc, what='BIC', ylim=c(-37000,-35000))

#model type VVE chosen as best, with similar BIC scores for 9-12 clusters
#to do additional model comparisons, generate and compare VVE models with 9-12 clusters
dat_mc9 <- Mclust(dat, G=9, modelNames="VVE")
dat_mc10 <- Mclust(dat, G=10, modelNames="VVE")
dat_mc11 <- Mclust(dat, G=11, modelNames="VVE")
dat_mc12 <- Mclust(dat, G=12, modelNames="VVE")

#extract BIC scores
BICs<-c(dat_mc9$BIC[1],dat_mc10$BIC[1],dat_mc11$BIC[1],dat_mc12$BIC[1])
#plot BIC scores
plot(c(9:12),BICs, pch=16, ylab="BIC Score",xlab="Number of Clusters", type='o')

#calculate change in BIC score, since in this method the goal to to maximize BIC
#we calculate change from highest scoring model
delta_BIC <- max(BICs) - BICs
#calculate BIC weights
w_BIC <- round(exp(-0.5*delta_BIC)/sum(exp(-0.5*delta_BIC)), digits=3)
#make table of results
results_table <- cbind.data.frame(clusters=c(9:12),BIC=BICs,delta_BIC,weight=w_BIC)
#order by delta
results_table <- results_table[order(delta_BIC),] 
rownames(results_table)<-NULL
#print table
kable(results_table[1:4], caption = "Model Comparison")
Model Comparison
clusters BIC delta_BIC weight
10 -35704.23 0.00000 1
12 -35731.46 27.23133 0
11 -35811.06 106.82569 0
9 -35818.70 114.46901 0

Extract and map cluster classifications

#extract classifications, e.g., which MSA belongs to which cluster, write to CSV
write.csv(dat_mc10$classification,file=here("results/cluster_assignment.csv"))
data<-read.csv("results/cluster_assignment.csv")

#import MSA boundaries shapefile
msa_Boundary <-readOGR(here("data/MSA"),"tl_2015_us_cbsa") 
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\dinap\Desktop\Research\sustainable_communities\data\MSA", layer: "tl_2015_us_cbsa"
## with 929 features
## It has 12 fields
## Integer64 fields read as strings:  ALAND AWATER
#merge polygons with cluster IDs
merged <- merge(msa_Boundary,data,by.x="NAME",by.y="X")
#remove any cases MSAs with no cluster assignments
merged_clean <- merged[!is.na(merged$x),]
#convert cluster ID to factor
merged_clean$x_f <- as.factor(merged_clean$x)
#plot
spplot(merged_clean, "x_f", col.regions=brewer.pal(10,'Spectral'))

Examine how clusters score in different metrics

#new data frame
dat_clust <- as.data.frame(dat)
#add names
dat_clust$ME <- rownames(dat)
rownames(dat_clust) <- NULL
#merge in cluster assignments
dat_clust <- merge(dat_clust,data,by.x="ME",by.y="X")

#calculate cluster means for different variables
c1m <- data.frame(one=colMeans(subset(dat_clust, x=="1")[2:17]))
c2m <- data.frame(two=colMeans(subset(dat_clust, x=="2")[2:17]))
c3m <- data.frame(three=colMeans(subset(dat_clust, x=="3")[2:17]))
c4m <- data.frame(four=colMeans(subset(dat_clust, x=="4")[2:17]))
c5m <- data.frame(five=colMeans(subset(dat_clust, x=="5")[2:17]))
c6m <- data.frame(six=colMeans(subset(dat_clust, x=="6")[2:17]))
c7m <- data.frame(seven=colMeans(subset(dat_clust, x=="7")[2:17]))
c8m <- data.frame(eight=colMeans(subset(dat_clust, x=="8")[2:17]))
c9m <- data.frame(nine=colMeans(subset(dat_clust, x=="9")[2:17]))
c10m <- data.frame(ten=colMeans(subset(dat_clust, x=="10")[2:17]))
cluster_means <- cbind.data.frame(c1m,c2m,c3m,c4m,c5m,c6m,c7m,c8m, c9m, c10m)
cluster_means$variable <- row.names(cluster_means)
rownames(cluster_means) <- NULL
#boxplots of scores for clusters
par(mfrow=c(2,5), mar=c(2.5,5,2.5,3))
boxplot(subset(dat_clust, x=="1")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 1")
boxplot(subset(dat_clust, x=="2")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 2")
boxplot(subset(dat_clust, x=="3")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 3")
boxplot(subset(dat_clust, x=="4")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 4")
boxplot(subset(dat_clust, x=="5")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 5")
boxplot(subset(dat_clust, x=="6")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 6")
boxplot(subset(dat_clust, x=="7")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 7")
boxplot(subset(dat_clust, x=="8")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 8")
boxplot(subset(dat_clust, x=="9")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 9")
boxplot(subset(dat_clust, x=="10")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 10")

par(mfrow=c(1,1))
#dotplots of cluster mean values
par(mfrow=c(2,5))
dotchart(cluster_means$one, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 1', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$two, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 2', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$three, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 3', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$four, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 4', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$five, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 5', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$six, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 6', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$seven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 7', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$eight, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 8', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$nine, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 9', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$ten, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 10', pch=16)
abline(v=0, lwd=2)

par(mfrow=c(1,1))

Dimension reduction

#run discriminant analysis on clusters
DR <- MclustDR(dat_mc10, lambda = 1) #setting lambda to 1 gives most separating directions
summary(DR)
## ----------------------------------------------------------------- 
## Dimension reduction for model-based clustering and classification 
## ----------------------------------------------------------------- 
## 
## Mixture model type: Mclust (VVE, 10) 
##         
## Clusters   n
##       1   50
##       2   59
##       3   53
##       4   62
##       5  150
##       6  129
##       7   51
##       8   66
##       9  142
##       10 124
## 
## Estimated basis vectors: 
##                            Dir1       Dir2       Dir3       Dir4      Dir5
## AQI_Good              -0.176001  0.0167446  0.2544551 -0.0041113 -0.199672
## Bachelor_Over_25       0.072840 -0.2897267  0.3135171 -0.5609009  0.421081
## Per_Poverty            0.022156  0.2216694  0.6106306 -0.1721047  0.485515
## Gini                  -0.018424 -0.1870715 -0.0774274 -0.0713467 -0.052922
## non_migration          0.122296  0.0306189 -0.1004147  0.4486234  0.108786
## Per_Sev_Hous          -0.096523 -0.2903113 -0.5171334 -0.2399188  0.140611
## Xstreamlengthimpaired  0.474859  0.2044703 -0.1971267  0.1582116  0.240913
## Per_Avg_Land_Cov      -0.253104 -0.3671406 -0.0276019  0.2069777 -0.338410
## poor_health_percent   -0.160339 -0.3429140 -0.0287426  0.2401105  0.097547
## Z_Water_Index         -0.751602  0.6152728 -0.1207784  0.0101883  0.270779
## Index_Black            0.012783  0.0364651 -0.1388193 -0.0924190  0.063046
## Index_Asian            0.030941 -0.1078755  0.1763719  0.0492666  0.086927
## Index_Latino           0.053899  0.0566457 -0.0075445  0.2151389 -0.028245
## GHG_Percap            -0.134751 -0.0033918  0.1659712  0.0513501  0.460770
## UNEMPLOY               0.016303 -0.0286237 -0.2089966  0.0738562  0.156282
## FOOD_INS               0.113812 -0.2359471  0.0577083  0.3324095 -0.089733
## VIO_CRIME             -0.151807 -0.0315984 -0.0236525  0.3006170  0.013413
##                            Dir6      Dir7       Dir8       Dir9
## AQI_Good               0.247547  0.396043  0.0874908 -0.0832171
## Bachelor_Over_25      -0.170414 -0.031725 -0.3655544  0.0382844
## Per_Poverty           -0.064981  0.201735 -0.6034751 -0.4234419
## Gini                  -0.079564 -0.351825  0.2765681 -0.1192998
## non_migration          0.292492 -0.165084 -0.1740190 -0.2571548
## Per_Sev_Hous           0.351465  0.364975  0.3325524 -0.2378273
## Xstreamlengthimpaired -0.438677  0.568038 -0.0946994 -0.0208607
## Per_Avg_Land_Cov      -0.188715  0.293543 -0.4090132  0.0051191
## poor_health_percent   -0.296444  0.183355  0.1365397  0.1702002
## Z_Water_Index         -0.218556  0.071558 -0.0198185  0.0456237
## Index_Black            0.105431 -0.086784 -0.1339241  0.4102571
## Index_Asian           -0.134440 -0.048924  0.0340747 -0.1862869
## Index_Latino           0.026287 -0.080645 -0.1474826 -0.1506777
## GHG_Percap             0.322789  0.095990 -0.0615401  0.1927214
## UNEMPLOY               0.388493  0.056520 -0.0461231  0.2111648
## FOOD_INS              -0.116806 -0.038561  0.1871896  0.5775028
## VIO_CRIME             -0.166452 -0.198262  0.0084581 -0.0539742
## 
##                 Dir1     Dir2     Dir3     Dir4     Dir5     Dir6     Dir7
## Eigenvalues  0.94731  0.67751  0.50022  0.43587  0.32617  0.18446  0.13214
## Cum. %      28.77575 49.35584 64.55074 77.79092 87.69873 93.30189 97.31587
##                 Dir8       Dir9
## Eigenvalues  0.06914   0.019223
## Cum. %      99.41608 100.000000
#plot eigenvalues
plot(c(1:17),DR$evalues,ylab="eigenvalues", xlab="component", type="o", pch=16)

#plot all pairs of combinations
plot(DR, what='pairs',symbols=c("1","2","3","4","5","6","7","8","9","10"),
     colors=c("red","blue","green","goldenrod2","violet","brown","black","grey30","slateblue2","seagreen3"))

#plot directions for variables
Directions <- data.frame(d1=DR$basis[,1],d2=DR$basis[,2],
                         d3=DR$basis[,3],d4=DR$basis[,4],
                         d5=DR$basis[,5],d6=DR$basis[,6],
                         d7=DR$basis[,7],d8=DR$basis[,8],
                         d9=DR$basis[,9],var=rownames(DR$basis))

par(mfrow=c(3,3))
dotchart(Directions$d1, labels=Directions$var, pch=16, main="Dir1", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d2, labels=Directions$var, pch=16, main="Dir2", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d3, labels=Directions$var, pch=16, main="Dir3", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d4, labels=Directions$var, pch=16, main="Dir4", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d5, labels=Directions$var, pch=16, main="Dir5", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d6, labels=Directions$var, pch=16, main="Dir6", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d7, labels=Directions$var, pch=16, main="Dir7", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d8, labels=Directions$var, pch=16, main="Dir8", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d9, labels=Directions$var, pch=16, main="Dir9", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)

par(mfrow=c(1,1))